In this notebook, any specified model that adheres to the sklearn API can be fitted across customisable training windows, with predictions made and plotted for the specified test window. Fitted models are persisted to disk so that repeat runs are fast.
import os

from joblib import dump, load
import numpy as np
import pandas as pd
import seaborn as sns
# FIX: was `import matplotlib as plt` — the `plt` alias conventionally (and in all
# downstream usage) refers to the pyplot submodule, not the top-level package.
import matplotlib.pyplot as plt
from tscv import GapRollForward
from tqdm.notebook import tqdm

# --- notebook parameters: import, choose model, set hyperparameters
from sklearn.ensemble import (RandomForestRegressor, GradientBoostingRegressor,
                              AdaBoostRegressor, HistGradientBoostingRegressor)
from sklearn.neighbors import KNeighborsRegressor
# specify load and weather data details
DATA_PATH = '../data'  # root data directory; expects one sub-directory per region
REGIONS = ['nsw', 'sa']  # regions to model — also the sub-directory names and holiday jurisdiction codes
DATA_FILENAME = 'merged.csv'  # merged load/weather file inside each region's directory
OBS_PER_DAY = 24  # observations per day (hourly data)
X_EXCLUDE = ['datetime', 'pv_est', 'net_load', 'total_load']  # columns never used as model features
HOLIDAY_FILENAME = 'australian-public-holidays-combined-2021-2024.csv'
# for convenience below
obs_year, obs_week = OBS_PER_DAY * 365, OBS_PER_DAY * 7
# specify methodology and model parameters
TRAIN_BEGIN = '2020-01-01'  # observations before this date are excluded entirely
TRAIN_MIN_SIZE = obs_year  # change for expanding window
TRAIN_MAX_SIZE = obs_year  # change for expanding window (np.inf)
TEST_MIN_SIZE = obs_week
TEST_MAX_SIZE = obs_week
TEST_FINAL_N = None  # set to None for rolling test window, or n to test final observations
ROLL_SIZE = obs_week  # step between successive train/test windows
MODEL_SELECTION = 'hgb'  # key into MODELS_DEFINITION below
# registry of candidate sklearn-API regressors; 'kwargs' (optional) are passed to the constructor
MODELS_DEFINITION = {
    'rf': {
        'class': RandomForestRegressor,
        'kwargs': {'n_jobs': 8}
    },
    'hgb': {
        'class': HistGradientBoostingRegressor,
        'kwargs': {}  # capable of quantile loss, l2reg
    },
    'gb': {'class': GradientBoostingRegressor},
    'ada': {'class': AdaBoostRegressor},
    'knn': {'class': KNeighborsRegressor}
}
# --- end notebook parameters
# prep model and persistence
MODEL = MODELS_DEFINITION[MODEL_SELECTION]['class']
# FIX: `__name__` gives the bare class name directly; the previous
# str(MODEL).split("'")[1].split('.')[-1] parse produced the same value fragilely.
MODEL_TYPE = MODEL.__name__  # only for plot display
MODEL_KWARGS = MODELS_DEFINITION[MODEL_SELECTION].get('kwargs', {})
# one persistence directory per region, namespaced by the selected model key
MODEL_PERSISTENCE_DIRS = {R: os.path.join('../models', R, MODEL_SELECTION) for R in REGIONS}
for model_dir in MODEL_PERSISTENCE_DIRS.values():  # FIX: `dir` shadowed the builtin
    os.makedirs(model_dir, exist_ok=True)
# load the combined public-holiday table; read everything as strings, then
# convert the Date column to proper pandas datetimes for merging later
holiday_path = os.path.join(DATA_PATH, HOLIDAY_FILENAME)
holiday_df = pd.read_csv(holiday_path, dtype='str')
holiday_df['Date'] = holiday_df['Date'].astype('datetime64[ns]')
# import and preprocess data: one dataframe per region with calendar and holiday
# features derived from the datetime column
dfs = {}
for R in REGIONS:
    full_data_path = os.path.join(DATA_PATH, R, DATA_FILENAME)
    df = pd.read_csv(full_data_path)  # FIX: os.path.relpath() wrapper was a no-op
    # FIX: unit-less astype('datetime64') raises TypeError on pandas >= 2.0;
    # parse explicitly instead
    df['datetime'] = pd.to_datetime(df['datetime'])
    dt = df['datetime'].dt
    df['year'] = dt.year
    df['month'] = dt.month
    df['day'] = dt.day
    df['hour'] = dt.hour
    df['minute'] = dt.minute
    df['day_of_week'] = dt.day_of_week
    df['week'] = dt.isocalendar().week
    # left-join this region's holidays on the calendar date, then integer-encode
    # the holiday names (each distinct name -> 0..k-1; non-holidays remain NaN)
    holidays = holiday_df.loc[holiday_df['Jurisdiction'] == R, ['Date', 'Holiday Name']]
    names = holidays['Holiday Name'].unique()
    df['holiday'] = pd.merge(dt.floor('D'), holidays, left_on='datetime', right_on='Date',
                             how='left')['Holiday Name'].replace(names, range(len(names)))
    dfs[R] = df
# Fit (or load from disk) one model per rolling window per region, then predict
# the corresponding test window. Collects one prediction frame per window in prdfs.
prdfs = []
for R in REGIONS:
    persistence_dir = MODEL_PERSISTENCE_DIRS[R]
    df = dfs[R]
    # compute X and y column indices (features = every column not in X_EXCLUDE)
    X_cols = np.setdiff1d(df.columns.values, X_EXCLUDE)
    X_inds = sorted(df.columns.get_indexer_for(X_cols))
    y_ind = df.columns.get_loc('net_load')
    # create train/test window strategy over observations from TRAIN_BEGIN onward
    df_subset = df[df.datetime >= TRAIN_BEGIN]
    tscv = GapRollForward(min_train_size=TRAIN_MIN_SIZE, max_train_size=TRAIN_MAX_SIZE,
                          min_test_size=TEST_MIN_SIZE, max_test_size=TEST_MAX_SIZE,
                          roll_size=ROLL_SIZE)
    # count splits up front purely for operator feedback
    print(sum(1 for i in tscv.split(df_subset)), f'{R} models to be loaded/trained')
    # execute train/test window strategy
    for i, (train_ind, test_ind) in tqdm(enumerate(tscv.split(df_subset))):
        if TEST_FINAL_N:
            # override the rolling test window: always test on the final N observations
            test_ind = range(-TEST_FINAL_N, 0)
        X_train, X_test = df_subset.iloc[train_ind, X_inds], df_subset.iloc[test_ind, X_inds]
        y_train, y_test = df_subset.iloc[train_ind, y_ind], df_subset.iloc[test_ind, y_ind]
        # train or load model; the filename encodes the training date span, observation
        # frequency, hyperparameters, and feature-column indices so a cached model is
        # only reused for an identical configuration
        # (column 0 is assumed to be 'datetime' here — consistent with the preprocessing above)
        begin, end = df_subset.iloc[[train_ind[0], train_ind[-1]], 0].dt.date
        argstring = '_'.join([f'{k}={v}' for k, v in MODEL_KWARGS.items()])
        x_ind_string = '-'.join([str(x) for x in X_inds])
        # NOTE(review): the '_h_' infix's meaning isn't evident from this file — confirm
        model_filename = f'{begin}_{OBS_PER_DAY}_{end}_{argstring}_h_{x_ind_string}.joblib'
        model_filepath = os.path.join(persistence_dir, model_filename)
        try:
            # cache hit: reuse the persisted fitted model
            model = load(model_filepath)
        except FileNotFoundError:
            # cache miss: fit a fresh model and persist it for future runs
            model = MODEL(**MODEL_KWARGS)
            model.fit(X_train, y_train)
            dump(model, model_filepath)
        # predict the test window and record alongside the actuals
        prd = model.predict(X_test)
        prdf = pd.DataFrame({'datetime': df_subset.iloc[test_ind, 0],
                             'region': R,
                             'model': i,
                             'train_end': end,
                             'predicted': prd,
                             'net_load': y_test})
        prdfs.append(prdf)
# Concatenate all per-window prediction frames and derive pointwise error metrics.
# NOTE: Percent Error divides by net_load, so rows with net_load == 0 yield inf.
predictions = pd.concat(prdfs)
residual = predictions['predicted'] - predictions['net_load']
percent = residual / predictions['net_load']
predictions['Residual'] = residual
predictions['Absolute Error'] = residual.abs()
predictions['Percent Error'] = percent
predictions['Absolute Percent Error'] = percent.abs()
predictions['Squared Error'] = residual ** 2
# distribution of each metric per region and training-window end date
prediction_summary = predictions.groupby(['region', 'train_end']).describe()
131 nsw models to be loaded/trained
0it [00:00, ?it/s]
113 sa models to be loaded/trained
0it [00:00, ?it/s]
import plotly.express as px
import plotly.offline
# inject plotly's JS into the notebook so figures render without a network connection
plotly.offline.init_notebook_mode()
# massage prediction summary into long-form with desired quartiles:
# stack both column levels (metric name, describe() statistic) into the row index,
# slice to the five error metrics and the five order statistics, then flatten to a
# tidy frame with 'metric' and 'quartile' columns for plotly faceting below
preds = prediction_summary.stack([0, 1]).loc[:, :,
        ['Residual', 'Absolute Error', 'Percent Error', 'Absolute Percent Error', 'Squared Error'],
        ['max', '75%', '50%', '25%', 'min']
        ].reset_index(name='value').rename(
    {'level_2': 'metric', 'level_3': 'quartile'}, axis=1)
# aggregate error metrics per region, for display in the subplot titles
regional_preds = predictions.groupby('region')
means = {col: regional_preds[col].mean()
         for col in ('Absolute Percent Error', 'Absolute Error', 'Squared Error')}
mapes = means['Absolute Percent Error']
maes = means['Absolute Error']
rmses = means['Squared Error'] ** 0.5
# for pretty aggregate metric display in subplot titles
def subfig_title(a):
    """Rewrite a facet annotation in place: strip the 'facet=' prefix, and for
    region facets append that region's aggregate error metrics."""
    facet_name = a.text.split("=")[-1]
    if facet_name in REGIONS:
        mape = mapes[facet_name]
        mae = maes[facet_name]
        rmse = rmses[facet_name]
        a.update(text=f'{facet_name} MAPE={mape:.2%} MAE={mae:.1f} RMSE={rmse:.1f}')
    else:
        # metric-row annotation: just drop the 'metric=' prefix
        a.update(text=facet_name)
# one facet column per region, one facet row per metric; one line per quartile
fig = px.line(preds, x='train_end', y='value', color='quartile', facet_col='region',
              facet_row='metric', title=MODEL_TYPE, height=800, template='ggplot2',
              facet_col_spacing=0.08)
# metrics live on very different scales, so un-share the y axes and show all tick labels
fig.update_yaxes(matches=None, title='', showticklabels=True)
fig.for_each_annotation(subfig_title)  # rewrite facet titles (adds aggregate metrics)
fig.show()